%Thomas Ng
\documentclass[11pt]{article}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{float}

\newcommand{\fignum}[1]{Fig~\ref{fig:#1}}
\DeclareMathOperator*{\argmax}{argmax}

\textwidth=6.5in
\textheight=9in
\oddsidemargin=0in
\evensidemargin=0in
\topmargin=0in
\topskip=0in
\headheight=0in
\headsep=0in


\makeatletter
\def\@maketitle{%
  \newpage
  \begin{center}%
  \let \footnote \thanks
    {\LARGE \@title \par}%
    \vskip 1.5em%
    {\large
      \lineskip .5em%
      \begin{tabular}[t]{c}%
        \@author
      \end{tabular}\par}%
    \vskip 1em%
    {\large \@date}%
  \end{center}%
  \par
  \vskip 1.5em}
\makeatother

\author{Thomas Ng, Eric Anderson}
\title{Brainstorm on allelic frequencies estimation based on NGS}
\date{\today}
\begin{document}
\maketitle
Given $N$ diploid individuals in a NGS experiment, determing the frequency $\rho$ of the SNPs occured in a population is of interest. A reference allele is denoted as "1"; "0" - otherwise. For the $i^{th}$ individual, the unobserved genotype $z_i$, where $z_i \in \{0,1,2\}$, counts the number of "1" alleles that individual $i$ carries. Let $r_i$ ( \textgreater\ 0) be the total number of $i^{\text{th}}$ individual reads containing the SNP and $y_i$ be the number of reads from $i$ having the reference allele - "1". Assume that $y_i$ is binomially distributed with $r_i$ trials and the success probability of each case is set to $\frac{z_i}{2}$. \\\\
Ideally, individuals are randomly sampled from a population. Sequencing errors (SNP locality in reference to the read length and read clonality) and alignment scores are not considered in this general case but will be included in the future. \\
\\
Additional Notation: \\
$z_i = ( z_i^{0}, z_i^{1}, z_i^{2} )$, where $z_i^{j}$ = 1 of the $i^{th}$ individual has j copies of the reference allele and otherwise \\ 
\\Outline: \\
1. Determine the likelihood of $p$ given $Z = \{z_1, z_2, ..., z_N\}, R = \{r_1, r_2, ...,r_N\}$ and $Y = \{y_1, y_2, ..., y_N\} $ \\
\begin{eqnarray}
 p (Z, Y| \rho ) &=& \prod\limits_{i=1}^N p(y_i|z_i,\rho) p(z_i, \rho)  \nonumber \\
&=& \prod\limits_{i=1}^N \sum\limits_{j=0}^2 p(y_i|z_i^j) p(z_i^j| \rho) \nonumber \\
%&=& \prod\limits_{i=1}^N (\ p(y_i|z_i^1)\ p(z_i = (1,0,0)) + p(y_i|z_i^2)\ p(z_i= (0,1,0)) + p(y_i|z_i^3)\ p(z_i = (0,0,1))\ ) \ \nonumber \\
&=& \prod\limits_{i=1}^N \sum\limits_{j=0}^2 {r_i \choose y_i}{(\frac{z_i}{2})}^{y_i} {(1-\frac{z_i}{2})}^{r_i - y_i} p(z_i^j|\rho) \nonumber \\
&=& \prod\limits_{i=1}^N\ \begin{cases} ( 1 - \rho )^2 *\ \ \begin{cases}0\ \text{if } y_i > 0\\
1\ \text{if otherwise,}  \end{cases} \text{ and } z_i = 0 \\\\
{r_i \choose y_i}(1/2)^{r_i}\ 2\rho(1-\rho)\ \text{if}\ z_i = 1,  \\\\
\rho^2\ *\ \begin{cases} 0 \text{ if } r_i>y_i\\
1 \text{ if } y_i = r_i\end{cases} \text{ and }\ z_i = 2 \end{cases} \nonumber
\end{eqnarray}
2. Write the EM algorithm to find the MLE of $\rho$ from the likelihood of $\rho$.  \\
The E step: \\
\begin{eqnarray}
p ( z_i=j | y_i, \rho) &=& \frac{p(z_i=j|\rho)\ p(y_i|z_i=j)} {\sum\limits_{k=0}^2 p(z_i=k|\rho)\ p(y_i|z_i=k) } \nonumber
\end{eqnarray}
The M step: \\
Let $\delta_j(z) =\ \{\ 1\ if\ z = j,\ 0\ if\ z \ne j \} $, M steps consists of maximizing the $\rho $ parameter :
\begin{align*}
\hat{\rho} \hspace{5mm} &= \argmax\limits_{\rho}\ E(\ \prod\limits_{i=1}^N \sum\limits_{j=0}^2 \delta_j(z_i)\ p(y_i|z_i^j)\ p(z_i^j| \rho) ) \nonumber \\
&= \argmax\limits_{\rho}\  E(\ \sum\limits_{i=1}^N \sum\limits_{j=0}^2 \delta_j(z_i)\ log (\ p(y_i|z_i^j)+p(z_i^j| \rho) ) ) \nonumber \\
&=  \argmax\limits_{\rho}\ \sum\limits_{i=1}^N \sum\limits_{j=0}^2 E(\delta_j(z_i))\ log(\ p(y_i|z_i^j)+p(z_i^j| \rho) ) \nonumber \\
&= \argmax\limits_{\rho}\ \sum\limits_{i=1}^N \sum\limits_{j=0}^2 q^t(z_i=j)\ log\ p(z_i^j| \rho)  \nonumber
\end{align*}
Let $Q_0$, $Q_1$, $Q_2$ be the value of $\sum\limits_{i=1}^N q^t(z_i=0)$, $\sum\limits_{i=1}^N q^t(z_i=1)$, and $\sum\limits_{i=1}^N q^t(z_i=2)$ respectively \\
\begin{align*}
%&=& \argmax\limits_{\rho}\ \sum\limits_{i=1}^N \sum\limits_{j=0}^2 \ 0\ \text{for $j$ = 0 or 2; } ( {r_i} \text{ log } {r_i \choose y_i}(1/2)+\text{ log } 2\rho(1-\rho)\ \text{for $j$ = 1})  \nonumber
\hat{\rho} \hspace{5mm} &= \argmax\limits_{\rho}\ Q_o \text{ log } (1-\rho)^2\ +\ Q_1 \text{ log } 2 \rho (1-\rho)\ +\ Q_2 \text{ log } \rho^{2} \nonumber
\end{align*}
Set $\frac{d}{d\rho}\ ( 2Q_o \text{ log} (1-\rho)\ +\ Q_1 \text{ log}(2\rho (1-\rho))\ +\ 2Q_2 \text{ log } \rho\ ) = 0.$ \\
\begin{align*}
0&= \frac{-2Q_o}{1-\rho} + \frac{Q_1(2-4\rho) }{2\rho (1-\rho)} + \frac{2Q_2}{\rho} \\
\rho &= \frac{0.5Q_1 + Q_2}{Q_0+Q_1+Q_2}
\end{align*}
\end{document}